library(ngsReports)
library(edgeR)
library(magrittr)
library(tidyverse)
library(ggrepel)
theme_set(theme_bw())
rawFqc <- list.files("../0_rawData/FastQC/", pattern = "zip$", full.names = TRUE) %>%
getFastqcData()
trimFqc <- list.files("../1_trimmedData/FastQC/", pattern = "zip$", full.names = TRUE) %>%
getFastqcData()
alnFqc <- list.files("../2_alignedData/FastQC/", pattern = "zip$", full.names = TRUE) %>%
getFastqcData()
rt <- list(
readTotals(rawFqc) %>% mutate(Stage = "Raw"),
readTotals(trimFqc) %>% mutate(Stage = "Trimmed"),
readTotals(alnFqc) %>% mutate(Stage = "Aligned")
) %>%
bind_rows() %>%
mutate(Sample = str_remove_all(Filename, "(_R1.fq.gz|Aligned.sortedByCoord.out.bam)"),
Sample = factor(Sample, levels = str_sort(unique(Sample), numeric = TRUE)),
Stage = factor(Stage, levels = c("Raw", "Trimmed", "Aligned")))
Summary of library sizes after each step. The slight increase after alignments indicates a low level of multiple alignments
Gene-level counts were loaded without filtering and an MDS plot generated in order to check whether replicate libraries grouped correctly.
counts <- read_tsv("../2_alignedData/featureCounts/genes.out") %>%
set_names(basename(colnames(.))) %>%
set_names(str_remove_all(colnames(.), "Aligned.sortedByCoord.out.bam"))
dge <- counts %>%
as.data.frame() %>%
column_to_rownames("Geneid") %>%
DGEList() %>%
calcNormFactors()
dge$samples %<>%
rownames_to_column("library") %>%
mutate(Genotype = str_extract_all(library, "(FAD|FS|__)"),
Genotype = str_replace_all(Genotype, "__", "WT"),
Genotype = factor(Genotype, levels = c("WT", "FS", "FAD")),
Sample = str_replace(library, "__-", "_WT"),
Sample = ifelse(str_count(Sample, "_") == 3, Sample, paste0(Sample, "_1")),
Replicate = str_extract(Sample, "[12]$"),
Sample = str_remove(Sample, "_[12]$"),
group = as.integer(Genotype)) %>%
column_to_rownames("library")
mds <- plotMDS(dge, plot = FALSE) %>%
extract2("cmdscale.out") %>%
set_colnames(paste0("Dim", 1:2)) %>%
as.data.frame() %>%
rownames_to_column("Library") %>%
as_tibble()
Plot of replicate libraries showing good concordance between replicates. These will be combined and low-expressed genes removed for the DE analysis